package com.asiainfo.zqx;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.select.Elements;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.regex.Pattern;

/**
 * Crawler for the trade-news section of zsnews.cn, built on WebCollector's
 * {@link BreadthCrawler}.
 *
 * <p>The crawl is seeded with the first listing page, tagged with the type
 * {@code "list"}. Every page that is not tagged {@code "text"} is scanned for
 * anchors whose absolute URL matches the article pattern; those URLs are
 * enqueued with the type {@code "text"}. Pages tagged {@code "text"} have
 * their date, author, title and body printed to standard output.
 *
 * <p>Only the first listing page ({@code p=1}) is seeded; further listing
 * pages are not enqueued by this class.
 */
public class ZhongShanwang extends BreadthCrawler {
    /** First listing page of the trade section; the only seed of the crawl. */
    private static final String SEED_URL = "http://www.zsnews.cn/trade.html?p=1";

    /** Type tag attached to the seed listing page. */
    private static final String TYPE_LIST = "list";

    /** Type tag attached to article pages discovered on a listing page. */
    private static final String TYPE_ARTICLE = "text";

    /**
     * Regex for article URLs. The dots of the host name and of the
     * {@code .html} suffix are escaped so that they only match a literal dot.
     */
    private static final String ARTICLE_URL_REGEX =
            "http://www\\.zsnews\\.cn/trade/index/view/cateid/45/id/.*\\.html";

    /** Compiled once so the regex is not recompiled for every anchor on a page. */
    private static final Pattern ARTICLE_URL_PATTERN = Pattern.compile(ARTICLE_URL_REGEX);

    /**
     * Number of leading characters dropped from the date element's text before
     * parsing. NOTE(review): presumably the length of a label in front of the
     * date; confirm against the live page markup.
     */
    private static final int DATE_PREFIX_LENGTH = 5;

    /**
     * Creates the crawler, registers the seed page and the article URL regex,
     * and limits fetching to a single thread.
     *
     * @param crawlPath passed through to the {@link BreadthCrawler} constructor
     */
    public ZhongShanwang(String crawlPath) {
        // The second argument (false) is the superclass's auto-parse flag; links
        // are therefore collected by hand in visit().
        super(crawlPath, false);
        addSeed(new CrawlDatum(SEED_URL, TYPE_LIST));
        setThreads(1);
        addRegex(ARTICLE_URL_REGEX);
        // setResumable is intentionally not called; the superclass default applies.
    }

    /**
     * Handles one fetched page: prints the article fields for pages tagged
     * {@code "text"}, otherwise enqueues the article links found on the page.
     * Pages without a content type are ignored.
     *
     * @param page        the fetched page
     * @param crawlDatums receives the article URLs to fetch next
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.contentType() == null) {
            return;
        }
        if (page.matchType(TYPE_ARTICLE)) {
            printArticle(page);
        } else {
            collectArticleLinks(page, crawlDatums);
        }
    }

    /** Prints URL, date, author, title, source and body text of an article page. */
    private void printArticle(Page page) {
        System.out.println("连接为" + page.url());
        printPublishDate(page);

        String rawAuthor = page.select("span.pL20").text();
        System.out.println("author" + rawAuthor);
        // replace() is a no-op when the label is absent, so no contains() check is needed.
        String author = rawAuthor.replace("来源：", "");

        // "artcle" is the spelling used by the original selector and is kept as is.
        System.out.println("标题为：" + page.select("div.artcle-title-panel>div").text());
        System.out.println("作者为" + author);
        System.out.println("来源为中山网");
        System.out.println("正文为" + page.select("div.j-content>p").text());
    }

    /**
     * Prints the date parsed from the first {@code div.floatL>span} element.
     * When the element is missing, its text is too short, or the date cannot
     * be parsed, a warning is written to standard error and the remaining
     * article fields are still printed by the caller.
     */
    private void printPublishDate(Page page) {
        Elements dateSpans = page.select("div.floatL>span");
        if (dateSpans.isEmpty()) {
            System.err.println("No date element found on " + page.url());
            return;
        }
        String labelledDate = dateSpans.get(0).text();
        if (labelledDate.length() <= DATE_PREFIX_LENGTH) {
            System.err.println("Date text too short on " + page.url() + ": '" + labelledDate + "'");
            return;
        }
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
        try {
            System.out.println("时间为" + dateFormat.parse(labelledDate.substring(DATE_PREFIX_LENGTH)));
        } catch (ParseException e) {
            System.err.println("Unparseable date on " + page.url() + ": '" + labelledDate + "' ("
                    + e.getMessage() + ")");
        }
    }

    /**
     * Prints every anchor's absolute URL and enqueues those that match the
     * article pattern, tagged as article pages.
     */
    private void collectArticleLinks(Page page, CrawlDatums crawlDatums) {
        Elements anchors = page.select("a");
        System.out.println(anchors.size());
        for (int i = 0; i < anchors.size(); i++) {
            // Resolve the absolute URL once per anchor; it is "" when there is no usable href.
            String href = anchors.get(i).attr("abs:href");
            System.out.println("elements:" + href);
            if (ARTICLE_URL_PATTERN.matcher(href).matches()) {
                crawlDatums.add(new CrawlDatum(href, TYPE_ARTICLE));
            }
        }
    }

    /**
     * Runs the crawler with the crawl path {@code "zhongshan"} and passes
     * {@code 3} to {@code start}.
     *
     * @param args unused
     * @throws Exception propagated from {@code start}
     */
    public static void main(String[] args) throws Exception {
        ZhongShanwang zhong = new ZhongShanwang("zhongshan");
        zhong.start(3);
    }
}
